07-mle-and-mm

Maximum likelihood (MLE) and method of moments (MM) are two common methods for constructing a model. For this assignment, we are going to model the unknown distributions with maximum likelihood and method of moments.

Data and preparation

library(tidyverse)
require(dplyr)
library(stats4)
Hmisc::getHdata(nhgh)
d1 <- nhgh %>% 
  filter(sex == "female") %>% 
  filter(age >= 18) %>% 
  select(gh, ht) %>% 
  filter(1:n()<=1000)

Glycohemoglobin (MLE)

Normal distribution

In this part, we assume that the Glycohemoglobin data is sampled from the normal distribution, and we plan to calculate the parameters first.

gh <- d1$gh
fun <- function(a, b)
  sum(-log(dnorm(gh, a, b)))
z <- mle(minuslogl = fun, start = list(a = 6, b = 1))
## Warning in dnorm(gh, a, b): NaNs produced

## Warning in dnorm(gh, a, b): NaNs produced

## Warning in dnorm(gh, a, b): NaNs produced
a <- coef(z)[1]
b <- coef(z)[2]
a
##        a 
## 5.724595
b
##        b 
## 1.051692

Then, we are going to overlay the estimated pdf onto histogram.

hist(gh, breaks = 20, freq = FALSE)
curve(dnorm(x, a, b), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(gh), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pnorm(x, a, b), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- gh
q_candidate <- qnorm

x <- q_candidate((1:200)/200, a, b)
y <- quantile(gh, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qnorm(0.5, a, b)
## [1] 5.724595

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rnorm(1e4, a, b)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##      97.5% 
## 0.05184657

Gamma distribution

In this part, we assume that the Glycohemoglobin data is sampled from the Gamma distribution, and we plan to calculate the parameters first.

gh <- d1$gh
fun <- function(a, b)
  sum(-log(dgamma(gh, a, b)))
z <- mle(minuslogl = fun, start = list(a = 40, b = 7))
## Warning in dgamma(gh, a, b): NaNs produced
a <- coef(z)[1]
b <- coef(z)[2]
a
##        a 
## 40.81883
b
##        b 
## 7.130414

Then, we are going to overlay the estimated pdf onto histogram.

hist(gh, breaks = 20, freq = FALSE)
curve(dgamma(x, a, b), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(gh), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pgamma(x, a, b), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- gh
q_candidate <- qgamma

x <- q_candidate((1:200)/200, a, b)
y <- quantile(gh, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qgamma(0.5, a, b)
## [1] 5.677929

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rgamma(1e4, a, b)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##      97.5% 
## 0.04385128

Weibull distribution

In this part, we assume that the Glycohemoglobin data is sampled from the Weibull distribution, and we plan to calculate the parameters first.

gh <- d1$gh
fun <- function(a, b)
  sum(-log(dweibull(gh, a, b)))
z <- mle(minuslogl = fun, start = list(a = 4, b = 6))
a <- coef(z)[1]
b <- coef(z)[2]
a
##        a 
## 4.125254
b
##        b 
## 6.173884

Then, we are going to overlay the estimated pdf onto histogram.

hist(gh, breaks = 20, freq = FALSE)
curve(dweibull(x, a, b), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(gh), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pweibull(x, a, b), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- gh
q_candidate <- qweibull

x <- q_candidate((1:200)/200, a, b)
y <- quantile(gh, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x,y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qweibull(0.5, a, b)
## [1] 5.64902

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rweibull(1e4, a, b)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##      97.5% 
## 0.07738035

Glycohemoglobin (MM)

Normal distribution

In this part, we assume that the Glycohemoglobin data is sampled from the normal distribution, and we plan to calculate the parameters first.

gh <- d1$gh
xb <- mean(gh)
s2 <- var(gh)
mu <- xb
sigma <- sqrt(s2)
mu
## [1] 5.7246
sigma
## [1] 1.052246

Then, we are going to overlay the estimated pdf onto histogram.

hist(gh, breaks = 20, freq = FALSE)
curve(dnorm(x, mu, sigma), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(gh), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pnorm(x, mu, sigma), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- gh
q_candidate <- qnorm

x <- q_candidate((1:200)/200, mu, sigma)
y <- quantile(gh, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qnorm(0.5, mu, sigma)
## [1] 5.7246

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rnorm(1e4, mu, sigma)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##      97.5% 
## 0.05142749

Gamma distribution

In this part, we assume that the Glycohemoglobin data is sampled from the Gamma distribution, and we plan to calculate the parameters first.

gh <- d1$gh
xb <- mean(gh)
s2 <- var(gh)
lh <- xb/s2
ch <- xb^2/s2
lh
## [1] 5.170237
ch
## [1] 29.59754

Then, we are going to overlay the estimated pdf onto histogram.

hist(gh, breaks = 20, freq = FALSE)
curve(dgamma(x, ch, lh), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(gh), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pgamma(x, ch, lh), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- gh
q_candidate <- qgamma

x <- q_candidate((1:200)/200, ch, lh)
y <- quantile(gh, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qgamma(0.5, ch, lh)
## [1] 5.660259

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rgamma(1e4, ch, lh)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##      97.5% 
## 0.05112776

Weibull distribution

In this part, we assume that the Glycohemoglobin data is sampled from the Weibull distribution, and we plan to calculate the parameters first.

gh <- d1$gh
xb <- mean(gh)
s2 <- var(gh)
cv2 = s2/xb^2
fun <- function(beta, cv2){
  abs(gamma(1 + 2/beta)/(gamma(1 + 1/beta))^2 - (1 + cv2))
}
k = optimize(fun, interval = c(0, 100), cv2 = cv2, maximum = FALSE)$minimum
lambda = xb/gamma(1 + 1/k)
k
## [1] 6.353198
lambda
## [1] 6.151307

Then, we are going to overlay the estimated pdf onto histogram.

hist(gh, breaks = 20, freq = FALSE)
curve(dweibull(x, k, lambda), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(gh), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pweibull(x, k, lambda), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- gh
q_candidate <- qweibull

x <- q_candidate((1:200)/200, k, lambda)
y <- quantile(gh, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x,y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qweibull(0.5, k, lambda)
## [1] 5.806483

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rweibull(1e4, k, lambda)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##      97.5% 
## 0.05162193

Height of adult females (MLE)

Normal distribution

In this part, we assume that the Glycohemoglobin data is sampled from the normal distribution, and we plan to calculate the parameters first.

ht <- d1$ht
fun <- function(a, b)
  sum(-log(dnorm(ht, a, b)))
z <- mle(minuslogl = fun, start = list(a = 160, b = 10))
## Warning in dnorm(ht, a, b): NaNs produced
a <- coef(z)[1]
b <- coef(z)[2]
a
##        a 
## 160.7412
b
##        b 
## 7.315558

Then, we are going to overlay the estimated pdf onto histogram.

hist(ht, breaks = 20, freq = FALSE)
curve(dnorm(x, a, b), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(ht), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pnorm(x, a, b), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- ht
q_candidate <- qnorm

x <- q_candidate((1:200)/200, a, b)
y <- quantile(ht, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qnorm(0.5, a, b)
## [1] 160.7412

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rnorm(1e4, a, b)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##     97.5% 
## 0.3591797

Gamma distribution

In this part, we assume that the Glycohemoglobin data is sampled from the Gamma distribution, and we plan to calculate the parameters first.

ht <- d1$ht
fun <- function(a, b)
  sum(-log(dgamma(ht, a, b)))
z <- mle(minuslogl = fun, start = list(a = 450, b = 2))
a <- coef(z)[1]
b <- coef(z)[2]
a
##        a 
## 480.4147
b
##        b 
## 2.988733

Then, we are going to overlay the estimated pdf onto histogram.

hist(ht, breaks = 20, freq = FALSE)
curve(dgamma(x, a, b), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(ht), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pgamma(x, a, b), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- ht
q_candidate <- qgamma

x <- q_candidate((1:200)/200, a, b)
y <- quantile(ht, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qgamma(0.5, a, b)
## [1] 160.6304

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rgamma(1e4, a, b)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##    97.5% 
## 0.361636

Weibull distribution

In this part, we assume that the Glycohemoglobin data is sampled from the Weibull distribution, and we plan to calculate the parameters first.

ht <- d1$ht
fun <- function(a, b)
  sum(-log(dweibull(ht, a, b)))
z <- mle(minuslogl = fun, start = list(a = 20, b = 160))
a <- coef(z)[1]
b <- coef(z)[2]
a
##        a 
## 21.85402
b
##        b 
## 164.2472

Then, we are going to overlay the estimated pdf onto histogram.

hist(ht, breaks = 20, freq = FALSE)
curve(dweibull(x, a, b), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(ht), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pweibull(x, a, b), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- ht
q_candidate <- qweibull

x <- q_candidate((1:200)/200, a, b)
y <- quantile(ht, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x,y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qweibull(0.5, a, b)
## [1] 161.5156

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rweibull(1e4, a, b)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##     97.5% 
## 0.4170276

Glycohemoglobin (MM)

Normal distribution

In this part, we assume that the Glycohemoglobin data is sampled from the normal distribution, and we plan to calculate the parameters first.

ht <- d1$ht
xb <- mean(ht)
s2 <- var(ht)
mu <- xb
sigma <- sqrt(s2)
mu
## [1] 160.7419
sigma
## [1] 7.320161

Then, we are going to overlay the estimated pdf onto histogram.

hist(ht, breaks = 20, freq = FALSE)
curve(dnorm(x, mu, sigma), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(ht), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pnorm(x, mu, sigma), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- ht
q_candidate <- qnorm

x <- q_candidate((1:200)/200, mu, sigma)
y <- quantile(ht, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qnorm(0.5, mu, sigma)
## [1] 160.7419

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rnorm(1e4, mu, sigma)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##     97.5% 
## 0.3602376

Gamma distribution

In this part, we assume that the Glycohemoglobin data is sampled from the Gamma distribution, and we plan to calculate the parameters first.

ht <- d1$ht
xb <- mean(ht)
s2 <- var(ht)
lh <- xb/s2
ch <- xb^2/s2
lh
## [1] 2.999769
ch
## [1] 482.1886

Then, we are going to overlay the estimated pdf onto histogram.

hist(ht, breaks = 20, freq = FALSE)
curve(dgamma(x, ch, lh), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(ht), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pgamma(x, ch, lh), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- ht
q_candidate <- qgamma

x <- q_candidate((1:200)/200, ch, lh)
y <- quantile(ht, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qgamma(0.5, ch, lh)
## [1] 160.6308

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rgamma(1e4, ch, lh)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##     97.5% 
## 0.3603169

Weibull distribution

In this part, we assume that the Glycohemoglobin data is sampled from the Weibull distribution, and we plan to calculate the parameters first.

ht <- d1$ht
xb <- mean(ht)
s2 <- var(ht)
cv2 = s2/xb^2
fun <- function(beta, cv2){
  abs(gamma(1 + 2/beta)/(gamma(1 + 1/beta))^2 - (1 + cv2))
}
k = optimize(fun, interval = c(0, 100), cv2 = cv2, maximum = FALSE)$minimum
lambda = xb/gamma(1 + 1/k)
k
## [1] 27.45941
lambda
## [1] 163.9807

Then, we are going to overlay the estimated pdf onto histogram.

hist(ht, breaks = 20, freq = FALSE)
curve(dweibull(x, k, lambda), add = TRUE)

Overlay the estimated cdf onto the ecdf

plot(ecdf(ht), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pweibull(x, k, lambda), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)

QQ plot

random_sample <- ht
q_candidate <- qweibull

x <- q_candidate((1:200)/200, k, lambda)
y <- quantile(ht, probs = (1:200)/200)

tgsify::plotstyle(style = upright)
plot(x,y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)

Estimated median

qweibull(0.5, k, lambda)
## [1] 161.8065

Median sampling distribution

median <- c()
for (i in 1:1e5) {
  data <- rweibull(1e4, k, lambda)
  median <- c(median, median(data))
}

hist(median, breaks = 20, freq = FALSE)

Range of middle 95% of sampling distribution

range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
##     97.5% 
## 0.3318572